import ast
import json
from collections import Counter

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import account_util as ut
# Load the tweet dump, keeping only the columns used by the account analysis.
df = pd.read_csv(
    '../tweets_provax.csv',
    low_memory=False,
    usecols=[
        'user_id', 'user_created_at', 'user_screen_name', 'user_mentions',
        'created_at', 'user_verified', 'rt_created_at',
        'in_reply_to_screen_name', 'rt_user_id', 'rt_user_screen_name',
        'is_self_rt', 'user_url_cred',
    ],
)

# Both timestamp columns share one textual layout:
# weekday, month, day, time, UTC offset, year.
for _timestamp_column in ('user_created_at', 'created_at'):
    df[_timestamp_column] = pd.to_datetime(
        df[_timestamp_column], format="%a %b %d %X %z %Y"
    )

df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 840271 entries, 0 to 840270 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 created_at 840271 non-null datetime64[ns, UTC] 1 user_id 840271 non-null int64 2 user_screen_name 840271 non-null object 3 user_verified 840271 non-null bool 4 user_created_at 840271 non-null datetime64[ns, UTC] 5 in_reply_to_screen_name 91966 non-null object 6 rt_created_at 609777 non-null object 7 rt_user_id 609777 non-null float64 8 rt_user_screen_name 609777 non-null object 9 user_mentions 840271 non-null object 10 is_self_rt 840271 non-null bool 11 user_url_cred 318493 non-null float64 dtypes: bool(2), datetime64[ns, UTC](2), float64(2), int64(1), object(5) memory usage: 65.7+ MB
# Build one row per account, indexed by user_screen_name.
#
# Only the two columns that are needed are aggregated. Summing every column
# of the grouped frame would also try to add up timestamp and text columns,
# which is slow and is rejected by recent pandas releases.
_tweets_by_user = df.groupby('user_screen_name')
dfAccount = pd.DataFrame({
    # An account counts as verified if any of its tweets carries the flag.
    'user_verified': _tweets_by_user['user_verified'].any(),
    # Account creation date, taken from the first tweet seen for the account.
    'created_at': _tweets_by_user['user_created_at'].first(),
})
# How many accounts are verified (absolute numbers)
_verified_total = dfAccount['user_verified'].sum()
values = [_verified_total, len(dfAccount) - _verified_total]
names = ['Yes', 'No']

fig = px.pie(values=values, names=names)
fig.update_layout(title="Is the user verified?")
fig.show()

# Total number of distinct accounts in the dataset.
len(dfAccount)
10464
# Number of accounts created per week, over the whole history.
dfCountUserCreation = (
    dfAccount.resample('W', on='created_at')
    .count()
    .iloc[:, 1]          # second column of the weekly counts
    .to_frame()
    .rename(columns={"created_at": "count"})
)

fig = px.histogram(
    dfCountUserCreation,
    x=dfCountUserCreation.index,
    y='count',
    title='User creation distribution',
    nbins=100,
)
fig.update_yaxes(title='count')
fig.update_xaxes(title='date')
fig.show()
Dal grafico sovrastante si possono notare due picchi importanti, nel 2012 e nel 2020, e una crescita importante del numero di profili creati nel 2009. La prima crescita, che risale appunto al 2009, è dovuta all'introduzione dell'italiano (insieme a francese, tedesco e spagnolo) come lingua attiva.
Dopodiché, nel 2012, Twitter ha per la prima volta battuto una notizia di rilevanza istituzionale con largo anticipo rispetto ai media tradizionali: la morte del presidente emerito Oscar Luigi Scalfaro. Questo ha portato a un aumento, soprattutto in Italia, delle iscrizioni a Twitter.
Infine, verso fine marzo 2020, soprattutto in Italia, vi è stato un ferreo lockdown a causa del Covid. Questo lockdown, che ha costretto tutti a casa, ha portato a un aumento di iscrizioni ai social, quindi anche a Twitter, per riempire i momenti di noia.
# When were most accounts created (since covid-19 started)?
date = "2019-11-17"  # 2019-11-17 --> the first ascertained case of COVID-19 infection is recorded
df1 = dfAccount[dfAccount['created_at'] > date]

# Weekly account-creation counts restricted to the covid period.
dfCountUserCreation = (
    df1.resample('W', on='created_at')
    .count()
    .iloc[:, 1]
    .to_frame()
    .rename(columns={"created_at": "count"})
)

fig = px.histogram(
    dfCountUserCreation,
    x=dfCountUserCreation.index,
    y='count',
    title='User cration distribution (since covid-19 started)',
    nbins=100,
)
fig.update_yaxes(title='count')
fig.update_xaxes(title='date')
fig.show()
Concentrando la distribuzione dal primo caso di covid-19 registrato, si possono notare tre picchi interessanti: a marzo 2020 (già notato nel grafico precedente), a novembre 2020 e a dicembre 2020. I nuovi picchi, a differenza di quello già notato a marzo, riguardano ciascuno una specifica settimana.
I picchi settimanali di novembre e dicembre accadono proprio in due momenti importanti in Italia: il primo, quello di novembre, capita durante la settimana del 3 novembre, quando viene istituito un coprifuoco dalle 22:00 alle 05:00 per tutta l'Italia; il secondo, invece, poco prima delle feste natalizie, quando il governo ha istituito la zona rossa nazionale.
In questi periodi, come a marzo, le persone, ritrovandosi senza nulla da fare, hanno visto nei social come Twitter un passatempo e uno sfogo.
# Read the manually classified account lists (no-vax, pro-vax, low-credibility links).
with open('../1_Dataset_preparation/listControlledUsers.json', 'r') as file_object:
    data = json.load(file_object)

listNovax, listLinkLow, listProvax = (
    data['Novax'],
    data['link_low'],
    data['Provax'],
)
# Prepare the per-account activity table (every tweet, retweet and reply counted)
dfTweetByName = ut.add_user_type(
    ut.get_df_raggruped(
        df.loc[:, 'user_screen_name'].to_frame(),
        'tweet_count',
        'user_screen_name',
    ),
    listNovax,
    listProvax,
    listLinkLow,
)
dfTweetByName
| tweet_count | user_type | |
|---|---|---|
| user_screen_name | ||
| danieledv79 | 3012 | Not defined |
| paoloigna1 | 2894 | Not defined |
| smilypapiking | 2823 | Not defined |
| Marilenapas | 2511 | Not defined |
| Profilo3Marco | 2152 | Not defined |
| ... | ... | ... |
| alexiula | 10 | Not defined |
| erregi20 | 10 | Not defined |
| ermanno32947318 | 10 | Not defined |
| BarbaraGiorgi | 10 | Not defined |
| bachsblueten | 10 | Not defined |
10464 rows × 2 columns
# Plot the most active accounts overall, for several cut-off sizes.
for i in (10, 20, 40, 50):
    ut.print_histogram_users(
        dfTweetByName,
        i,
        'tweet_count',
        f'Most {i} active users',
        'Count of tweets',
    )
# Accounts ranked by number of original posts, i.e. rows that are neither a
# reply nor a retweet.
_is_original_post = df['in_reply_to_screen_name'].isna() & df['rt_created_at'].isna()
df_tweets = df.loc[_is_original_post, 'user_screen_name'].to_frame()
df_tweets = ut.get_df_raggruped(df_tweets, 'tweet_count', 'user_screen_name')
# Keep the labelled result in df_tweets itself, so that dfTweetByName (the
# all-activity table) is not overwritten by the original-posts table.
df_tweets = ut.add_user_type(df_tweets, listNovax, listProvax, listLinkLow)
df_tweets
| tweet_count | user_type | |
|---|---|---|
| user_screen_name | ||
| Adnkronos | 1843 | Provax |
| repubblica | 1745 | Provax |
| paoloigna1 | 1477 | Not defined |
| Corriere | 1324 | Provax |
| La7tv | 1323 | Provax |
| ... | ... | ... |
| RosaMaiuccaro | 1 | Not defined |
| Ronozoico | 1 | Not defined |
| GGiotar | 1 | Not defined |
| giovit1961 | 1 | Not defined |
| LucaGiord74 | 1 | Not defined |
6948 rows × 2 columns
# Top 20 accounts by number of original posts.
ut.print_histogram_users(df_tweets, 20, 'tweet_count', 'Most %d users that create posts' % 20, 'Count of tweets')

# Retweet statistics per retweeted account.
# The retweet filter is evaluated once, and each aggregate touches only the
# column it needs: aggregating the whole grouped frame would also try to sum
# text and timestamp columns, which is slow and rejected by recent pandas.
_retweets_by_author = df[df['rt_created_at'].notna()].groupby('rt_user_screen_name')

df_retweet = pd.DataFrame()
df_retweet['all_rt'] = _retweets_by_author['user_id'].count()      # every retweet received
df_retweet['self_rt'] = _retweets_by_author['is_self_rt'].sum()    # retweets flagged is_self_rt
df_retweet['real_rt'] = df_retweet['all_rt'] - df_retweet['self_rt']
df_retweet.sort_values('real_rt', ascending=False, inplace=True)
df_retweet = ut.add_user_type(df_retweet, listNovax, listProvax, listLinkLow)
df_retweet
| all_rt | self_rt | real_rt | user_type | |
|---|---|---|---|---|
| rt_user_screen_name | ||||
| RobertoBurioni | 46522 | 0 | 46522 | Provax |
| Cartabellotta | 13343 | 263 | 13080 | Provax |
| Agenzia_Ansa | 8772 | 1 | 8771 | Link low credibility |
| lucianocapone | 8438 | 5 | 8433 | Provax |
| fattoquotidiano | 8171 | 1 | 8170 | Provax |
| ... | ... | ... | ... | ... |
| Aldosavar | 4 | 4 | 0 | Not defined |
| FilippoBuccella | 1 | 1 | 0 | Not defined |
| aferraricislit2 | 1 | 1 | 0 | Not defined |
| rosamar07455781 | 1 | 1 | 0 | Not defined |
| 80Ivi | 1 | 1 | 0 | Not defined |
19459 rows × 4 columns
# Plot the most retweeted accounts (self-retweets excluded), for several cut-off sizes.
for i in (10, 20, 40, 50):
    ut.print_histogram_users(
        df_retweet,
        i,
        'real_rt',
        f'Most {i} retweeted users',
        'Count of retweets',
    )
# For each cut-off, draw horizontal bars comparing 'real_rt' and 'self_rt'
# for the top-i retweeted accounts. Up to 20 accounts fit in a single panel;
# larger selections are split across two side-by-side panels.
_bar_series = (
    ('real_rt', 'All retweet', '#636EFA'),
    ('self_rt', 'Self retweet', '#EF553B'),
)
for i in (10, 20, 40, 50):
    if i <= 20:
        fig = make_subplots(rows=1, cols=1)
        _panels = [(df_retweet.head(i), 1)]
    else:
        fig = make_subplots(rows=1, cols=2)
        n = i // 2
        # Left panel: first i-n accounts; right panel: the remaining n.
        _panels = [
            (df_retweet.head(i - n), 1),
            (df_retweet.head(i).tail(n), 2),
        ]

    for _accounts, _panel_col in _panels:
        for _column, _label, _colour in _bar_series:
            fig.add_trace(
                go.Bar(
                    y=_accounts.index,
                    x=_accounts[_column],
                    orientation='h',
                    name=_label,
                    marker_color=_colour,
                ),
                row=1,
                col=_panel_col,
            )

    fig.update_layout(title="The most %d frequent retweet account" % i)
    fig.update_xaxes(title="Count of retweets")
    fig.update_yaxes(title="Username")
    fig.show()
# Accounts ranked by how many replies they received, labelled by user type.
df_replied = ut.get_df_raggruped(
    df.loc[:, 'in_reply_to_screen_name'].to_frame(),
    'reply_count',
    'in_reply_to_screen_name',
)
df_replied = ut.add_user_type(df_replied, listNovax, listProvax, listLinkLow)

ut.print_histogram_users(
    df_replied, 20, 'reply_count', 'Most 20 replied users', 'Reply count'
)
# Split the dataset into original tweets, retweets and replies.
retweet = df_retweet['all_rt'].sum()
reply = df_replied['reply_count'].sum()
# Whatever is neither a retweet nor a reply is counted as an original tweet.
tweet = len(df) - retweet - reply

names = ['Tweets', 'Retweet', 'Reply']
values = [tweet, retweet, reply]

fig = px.pie(values=values, names=names)
fig.update_layout(title="How are the tweets distribuited")
fig.update_traces(textinfo='value+percent')
fig.show()
# Build a table of all mentioned users with the number of times each is mentioned.
#
# Each 'user_mentions' cell holds the textual form of a list of dicts, one
# dict per mention, each with a 'screen_name' key. The text comes straight
# from the CSV, so it is parsed with ast.literal_eval, which accepts only
# Python literals, rather than eval, which would execute arbitrary code
# embedded in the data file.
listMention = [
    mention['screen_name']
    for raw_mentions in df['user_mentions']
    for mention in ast.literal_eval(raw_mentions)
]

dfMentions = pd.DataFrame()
dfMentions['name'] = listMention
dfMentions['count'] = 0  # placeholder column; groupby().count() turns it into the tally
dfMentions = dfMentions.groupby('name').count()
dfMentions.sort_values(['count'], axis=0, inplace=True, ascending=False)
dfMentions = ut.add_user_type(dfMentions, listNovax, listProvax, listLinkLow)
dfMentions
| count | user_type | |
|---|---|---|
| name | ||
| RobertoBurioni | 52583 | Provax |
| Cartabellotta | 16004 | Provax |
| repubblica | 12689 | Provax |
| fattoquotidiano | 10496 | Provax |
| lucianocapone | 9501 | Provax |
| ... | ... | ... |
| SirAlexPlini | 1 | Not defined |
| SippeNazionale | 1 | Not defined |
| SionRomina | 1 | Not defined |
| SinopoliFra | 1 | Not defined |
| zziatizia | 1 | Not defined |
37795 rows × 2 columns
# Plot the n most mentioned accounts.
n = 20
ut.print_histogram_users(
    dfMentions,
    n,
    'count',
    f'Most {n} mentioned users',
    'Number of mention',
)
# Number of tweets posted per week.
dfCountTweetCreation = (
    df.resample('W', on='created_at')
    .count()
    .iloc[:, 1]          # second column of the weekly counts
    .to_frame()
    .rename(columns={"user_id": "count"})
)

fig = px.histogram(
    dfCountTweetCreation,
    x=dfCountTweetCreation.index,
    y='count',
    title='Tweets creation distribution',
    nbins=100,
)
fig.show()
# Distribution of the 'user_url_cred' score, over rows that have a score
# and where that score is strictly below 1.
df1 = df.dropna(subset=['user_url_cred'])
_cred_scores = df1['user_url_cred']
user_credibility = _cred_scores[_cred_scores < 1]

fig = px.histogram(
    x=user_credibility,
    histnorm='percent',
    title="User credibility",
)
fig.show()